Java 程式碼 <<
Previous Next >> C或C++程式碼
Python 程式碼
def parse_content():
    """Split content.htm into pages, one page per h1/h2/h3 heading.

    The file ``config_dir + "content.htm"`` is created with a default
    heading and body when it is missing or empty.  Its markup is parsed
    with BeautifulSoup, passed through ``_remove_h123_attrs``, written
    back to disk, and re-read, so that the text being split is exactly
    the serialization the heading tags were taken from.

    Anything that appears before the first heading is discarded.

    Returns:
        A tuple ``(head_list, level_list, page_list)`` of three lists with
        one entry per heading, in document order:

        * ``head_list``  -- the heading text, stripped of surrounding
          whitespace.
        * ``level_list`` -- the heading level as a one-character string
          (the digit of the tag name, e.g. ``"1"`` for ``h1``).
        * ``page_list``  -- the markup between this heading and the next
          one (or the end of the document for the last heading).

        All three lists are empty when the content has no h1/h2/h3 tag.
    """
    content_path = config_dir + "content.htm"
    default_html = "<h1>head 1</h1>content 1"

    # If there is no content.htm, generate one with a default page.
    if not os.path.isfile(content_path):
        with open(content_path, "w", encoding="utf-8") as f:
            f.write(default_html)
    subject = file_get_contents(content_path)

    # An existing but empty content.htm gets the same default page.
    if subject == "":
        with open(content_path, "w", encoding="utf-8") as f:
            f.write(default_html)
        subject = default_html

    # Make the soup out of the html content.
    soup = bs4.BeautifulSoup(subject, 'html.parser')
    # Try to cope with the various forms a heading may take.
    # NOTE(review): _remove_h123_attrs is defined elsewhere; presumably it
    # strips attributes from h1-h3 tags -- confirm against its definition.
    soup = _remove_h123_attrs(soup)
    # Rewrite content.htm with the processed markup, then re-read subject so
    # that str(tag) below matches the text we are about to split.
    with open(content_path, "wb") as f:
        f.write(soup.encode("utf-8"))
    subject = file_get_contents(content_path)

    # Collect every h1, h2 and h3 tag in document order.
    headings = soup.find_all(['h1', 'h2', 'h3'])
    if not headings:
        # Nothing to split on: report "no pages" instead of failing with an
        # IndexError on headings[0].
        return [], [], []

    head_list = [tag.text.strip() for tag in headings]
    # The tag name is "h1"/"h2"/"h3"; its digit is the menu level.
    level_list = [tag.name[1] for tag in headings]

    # Drop everything up to and including the first heading.  partition()
    # splits on the first occurrence only and, unlike indexing the result of
    # split(), does not fail when the marker is absent.
    _, _, remainder = subject.partition(str(headings[0]))

    # The body of each page is the text that precedes the next heading.
    page_list = []
    for next_tag in headings[1:]:
        body, _, remainder = remainder.partition(str(next_tag))
        page_list.append(body)

    # Whatever is left belongs to the last heading.  As in the original
    # logic, it is cut at a further occurrence of that heading's markup,
    # should one exist.
    last_body, _, _ = remainder.partition(str(headings[-1]))
    page_list.append(last_body)

    return head_list, level_list, page_list
Java 程式碼 <<
Previous Next >> C或C++程式碼